HW 03

Author

Matt Osterhoudt

0 - Setup

knitr::opts_chunk$set(message = FALSE, warning = FALSE)

if (!require("pacman")) 
  install.packages("pacman")

# use this line for installing/loading
 pacman::p_load(tidyverse,
                palmerpenguins,
                here,
                cowgrid,
                dsbox,
                fs,
                janitor,
                scales,
                ggforce,
                glue,
                jpeg,
                png,
                grid,
                cowplot) 
install.packages("openintro")
package 'openintro' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\matto\AppData\Local\Temp\RtmpE9OGSW\downloaded_packages
install.packages("ggridges")
package 'ggridges' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
    C:\Users\matto\AppData\Local\Temp\RtmpE9OGSW\downloaded_packages
library(openintro)
library(ggridges)

devtools::install_github("tidyverse/dsbox")

ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

knitr::opts_chunk$set(
  fig.width = 7,
  fig.asp = .618,
  fig.retina = 3,
  fig.align = "center",
  dpi = 300
)

1 - Du Bois challenge.

# Reading in the data and pivoting the data to a longer format. This will help with the visualization.
income_data <- read_csv(here("data", "income.csv"))
income_data_longer <- pivot_longer(
  income_data,
  cols = c("Rent", "Food", "Clothes", "Tax", "Other"),
  names_to = "Category",
  values_to = "Percentage"
) 

# Modifies the Category order to match
income_data_longer$Category <- factor(income_data_longer$Category,
                                      levels = c("Other", "Tax", "Clothes",
                                                 "Food", "Rent"))

y_axis_relabel <- c(
  "$100-200" = "$100-300 ($139.1)",
  "$200-300" = "$200-300 ($249.45)",
  "$300-400" = "$300-400 ($335.66)",
  "$400-500" = "$400-500 ($433.82)",
  "$500-750" = "$500-$700 ($547)",
  "$750-1000" = "$750-1000 ($880)",
  "$1000 AND OVER" =  "$1000\nAND OVER ($1125)"
)

# Adds the background. 
background_img <- readJPEG(here("images", "parchment_paper.jpg"))


ggplot(income_data_longer, aes(x = Percentage, y = Class, fill = Category)) +
  annotation_raster(background_img, xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf) +
  geom_bar(stat = "identity", width = 0.4) +
  scale_y_discrete(limits = rev(c(
    "$100-200", "$200-300", "$300-400", "$400-500",
    "$500-750", "$750-1000", "$1000 AND OVER"
  )),labels = y_axis_relabel) +
  scale_fill_manual(values = c(
    "Rent" = "black",
    "Food" = "purple",
    "Clothes" = "red",
    "Tax" = "blue",
    "Other" = "gray"
  )) + 
  geom_text(
    aes(label = case_when(Percentage < 1 ~ "", TRUE ~ paste0(Percentage, "%"))),
    color = "white",
    fontface = "bold",
    size = 2.5,
    position = position_stack(vjust = 0.5)
    ) +
  labs(
    y = NULL,
    x = NULL
  ) + 
  theme_minimal() + 
  coord_cartesian(clip = "off") +
  scale_x_continuous(limits = c(0, 100), expand = c(0,0)) + 
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(hjust = 0),
    axis.title.y = element_text(angle = 0, size = 10, hjust = -2),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.title = element_blank(),
    legend.direction = "horizontal"
  ) +
  annotate("text",
           x = 0,
           y = 7.5,
           label = "Class (Actual Average)",
           hjust = 1,
           size = 3)

#I ran into the issue of the background not covering everything, so I used cowplot to help.

I used https://www.geeksforgeeks.org/r-language/how-to-add-image-to-ggplot2-in-r/ to help figure out how to add the background image.

2 - COVID survey - interpret

This plot is a visual of Covid survey responses regarding attitudes toward COVID-19 vaccines. The x-axis shows the Likert scale from 1-5 for each question, 1 being most agreeable and 5 being least agreeable. The y-axis contains all of the different groups categorized by varying demographics (age, gender, profession, etc.). I do agree with some of these results intuitively. For example, I expected that anyone with a background in medicine would view the vaccine in a positive way. As expected, those in the medical profession ranked many of the questions on the Likert scale closer to a 1 or a 2. I was a little surprised to see that, when comparing medical and nursing results on the question “I will recommend the vaccine…”, that there was a bit more variance on the medical profession. I expected the medical profession and nursing profession to be very very close in scoring. I also expected that the younger generation would rate the vaccine as safer and higher confidence, which turned out to be true in the data, especially compared to older ages.

3 - COVID survey - reconstruct

raw_data <- read_csv(here("data", "covid-survey.csv"), skip = 1)
view(raw_data)

clean_raw_data <- raw_data |>
  filter(!if_all(-response_id, is.na))

view(clean_raw_data)

relabel_data <- clean_raw_data |>
  mutate(
    exp_already_vax = case_when(exp_already_vax == 0 ~ "No",  
                                exp_already_vax == 1 ~ "Yes", TRUE ~ NA_character_),
    exp_flu_vax = case_when(exp_flu_vax == 0 ~ "No", 
                            exp_flu_vax == 1 ~ "Yes", TRUE ~ NA_character_),
    exp_profession = case_when(exp_profession == 0 ~ "Medical", 
                               exp_profession == 1 ~ "Nursing", TRUE ~ NA_character_),
    exp_gender = case_when(exp_gender == 0 ~ "Male", exp_gender == 1 ~ "Female", 
                           exp_gender == 3 ~ "Non-binary third gender", exp_gender == 4 ~ "Prefer not to say", 
                           TRUE ~ NA_character_),
    exp_race = case_when(exp_race == 1 ~ "American Indian / Alaskan Native",
                         exp_race == 2 ~ "Asian",
                         exp_race == 3 ~ "Black / African American",
                         exp_race == 4 ~ "Native Hawaiian / Other Pacific Islander",
                         exp_race == 5 ~ "White", TRUE ~ NA_character_),
    exp_ethnicity = case_when(exp_ethnicity == 1 ~ "Hispanic / Latino",
                              exp_ethnicity == 2 ~ "Non-Hispanic / Non-Latino", TRUE ~ NA_character_),
    exp_age_bin = case_when(
        exp_age_bin == 0 ~ "<20",
        exp_age_bin == 20 ~ "21-25",
        exp_age_bin == 25 ~ "26-30",
        exp_age_bin == 30 ~ ">30", TRUE ~ NA_character_),
  )
view(relabel_data)

# The first pivot takes all columns starting with exp_ and groups them into two new columns
# called explanatory and explanatory_value. Each explanatory variable is now its own row entry
# The second pivot now modifies the columns with "resp_" to do something similar. It groups the
# output of the first pivot into multiple rows based on resp.
covid_survey_longer <- relabel_data |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )


covid_survey_longer
# A tibble: 43,428 × 5
   response_id explanatory    explanatory_value response          response_value
         <dbl> <chr>          <chr>             <chr>                      <dbl>
 1           1 exp_profession Nursing           resp_safety                    5
 2           1 exp_profession Nursing           resp_confidence_…              2
 3           1 exp_profession Nursing           resp_concern_saf…              2
 4           1 exp_profession Nursing           resp_feel_safe_a…              1
 5           1 exp_profession Nursing           resp_will_recomm…              1
 6           1 exp_profession Nursing           resp_trust_info                1
 7           1 exp_flu_vax    Yes               resp_safety                    5
 8           1 exp_flu_vax    Yes               resp_confidence_…              2
 9           1 exp_flu_vax    Yes               resp_concern_saf…              2
10           1 exp_flu_vax    Yes               resp_feel_safe_a…              1
# ℹ 43,418 more rows
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarize(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.1, na.rm = TRUE),
    high = quantile(response_value, probs = 0.9, na.rm = TRUE)
  )
covid_survey_summary_stats_by_group
# A tibble: 126 × 6
# Groups:   explanatory, explanatory_value [21]
   explanatory explanatory_value response                 mean   low  high
   <chr>       <chr>             <chr>                   <dbl> <dbl> <dbl>
 1 exp_age_bin 21-25             resp_concern_safety      3.32     2     5
 2 exp_age_bin 21-25             resp_confidence_science  1.31     1     2
 3 exp_age_bin 21-25             resp_feel_safe_at_work   1.20     1     2
 4 exp_age_bin 21-25             resp_safety              1.95     1     5
 5 exp_age_bin 21-25             resp_trust_info          1.29     1     2
 6 exp_age_bin 21-25             resp_will_recommend      1.09     1     1
 7 exp_age_bin 26-30             resp_concern_safety      3.35     1     5
 8 exp_age_bin 26-30             resp_confidence_science  1.40     1     2
 9 exp_age_bin 26-30             resp_feel_safe_at_work   1.29     1     2
10 exp_age_bin 26-30             resp_safety              2.16     1     5
# ℹ 116 more rows
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarize(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.1, na.rm = TRUE),
    high = quantile(response_value, probs = 0.9, na.rm = TRUE)
  ) |>
  ungroup() |>
  mutate(
    explanatory = "All",
    explanatory_value = ""
  )

covid_survey_summary_stats_all
# A tibble: 6 × 6
  response                 mean   low  high explanatory explanatory_value
  <chr>                   <dbl> <dbl> <dbl> <chr>       <chr>            
1 resp_concern_safety      3.28     1     5 All         ""               
2 resp_confidence_science  1.43     1     2 All         ""               
3 resp_feel_safe_at_work   1.36     1     2 All         ""               
4 resp_safety              2.03     1     5 All         ""               
5 resp_trust_info          1.40     1     2 All         ""               
6 resp_will_recommend      1.21     1     2 All         ""               
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_by_group, covid_survey_summary_stats_all
)

covid_survey_summary_stats
# A tibble: 132 × 6
# Groups:   explanatory, explanatory_value [22]
   explanatory explanatory_value response                 mean   low  high
   <chr>       <chr>             <chr>                   <dbl> <dbl> <dbl>
 1 exp_age_bin 21-25             resp_concern_safety      3.32     2     5
 2 exp_age_bin 21-25             resp_confidence_science  1.31     1     2
 3 exp_age_bin 21-25             resp_feel_safe_at_work   1.20     1     2
 4 exp_age_bin 21-25             resp_safety              1.95     1     5
 5 exp_age_bin 21-25             resp_trust_info          1.29     1     2
 6 exp_age_bin 21-25             resp_will_recommend      1.09     1     1
 7 exp_age_bin 26-30             resp_concern_safety      3.35     1     5
 8 exp_age_bin 26-30             resp_confidence_science  1.40     1     2
 9 exp_age_bin 26-30             resp_feel_safe_at_work   1.29     1     2
10 exp_age_bin 26-30             resp_safety              2.16     1     5
# ℹ 122 more rows
response_labels <- c(
  "resp_safety" = "Based on my understanding,\nI believe the vaccine is safe",
  "resp_confidence_science" = "I am confident in the scientific\nvetting process for the new COVID vaccines",
  "resp_feel_safe_at_work" = "Getting the vaccine will\nmake me feel safer at work",
  "resp_will_recommend" = "I will recommend the vaccine\nto family, friends, and community members",
  "resp_trust_info" = "I trust the information\nthat I have received about the vaccines",
  "resp_concern_safety" = "I am concerned about the safety\nand side effects of the vaccine"
)

explanatory_labels <- c(
  "All"            = "All",
  "exp_age_bin"    = "Age",
  "exp_gender"     = "Gender",
  "exp_race"       = "Race",
  "exp_ethnicity"  = "Ethnicity",
  "exp_profession" = "Profession",
  "exp_already_vax"= "Had COVID vaccine",
  "exp_flu_vax"    = "Had flu vaccine this year"
)


# Plot code below

ggplot(covid_survey_summary_stats, aes(x = mean, y = explanatory_value)) + 
  geom_point(size = 0.7) +
  geom_errorbarh(
    aes(xmin = low, xmax = high),
    height = 0.3,
    size = 0.3
  ) + 
  facet_grid(explanatory ~ response,
             scales = "free_y",
             space = "free_y",
             labeller = labeller(
               explanatory = as_labeller(explanatory_labels, label_wrap_gen(15)),
               response = as_labeller(response_labels, label_wrap_gen(20))
             )) + 
  labs(
    x = "Mean Likert score\n(Error bars range from 10th to 90th percentile)",
    y = NULL
  ) +
  theme(
    strip.background = element_rect(fill = "gray90", color = "black", size = 0.1),
    axis.text.y = element_text(size = 5),
    strip.text.y = element_text(size = 5, angle = 0),
    strip.text.x = element_text(size = 5),
    panel.spacing = unit(.1, "lines"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_text(size = 6)
  )

4 - COVID survey - re-reconstruct

covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarize(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.25, na.rm = TRUE),
    high = quantile(response_value, probs = 0.75, na.rm = TRUE)
  )

covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarize(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.25, na.rm = TRUE),
    high = quantile(response_value, probs = 0.75, na.rm = TRUE)
  ) |>
  ungroup() |>
  mutate(
    explanatory = "All",
    explanatory_value = ""
  )

covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_by_group, covid_survey_summary_stats_all
)

# Plot code below
ggplot(covid_survey_summary_stats, aes(x = mean, y = explanatory_value)) + 
  geom_point(size = 0.7) +
  geom_errorbarh(
    aes(xmin = low, xmax = high),
    height = 0.3,
    size = 0.3
  ) + 
  facet_grid(explanatory ~ response,
             scales = "free_y",
             space = "free_y",
             labeller = labeller(
               explanatory = as_labeller(explanatory_labels, label_wrap_gen(15)),
               response = as_labeller(response_labels, label_wrap_gen(20))
             )) + 
  labs(
    x = "Mean Likert score\n(Error bars range from 25th to 75th percentile)",
    y = NULL
  ) +
  theme(
    strip.background = element_rect(fill = "gray90", color = "black", size = 0.1),
    axis.text.y = element_text(size = 5),
    strip.text.y = element_text(size = 5, angle = 0),
    strip.text.x = element_text(size = 5),
    panel.spacing = unit(.1, "lines"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_text(size = 6)
  )

5 - COVID survey - another view